
"""
Fixes misextracted links in the revisions table.  Some of the links
mistakenly include symbols like " and > and | which were not separated
by whitespace in the wikitext.  These symbols act as delimiters and
everything after them should not be included in the links.

"""
# $Id: fix_links.py 24 2011-07-31 23:28:07Z postshift@gmail.com $
import re
import sys
import urlparse
import traceback
import wikinbitDb

# Characters that end a URL in wikitext, written as the inside of a regex
# character class: whitespace, | " } ] , and <.
# NOTE(review): the module docstring also mentions ">" as a delimiter, but it
# is not in this list -- confirm whether it should be added.
terminators = r'''\s|"}\],<'''

# Compiled once instead of on every row.  Group 1 is the URL proper: it must
# begin with "http" or "ftp" and runs (non-greedily) up to the first
# terminator character or the end of the string.
_url_re = re.compile(u'''((http|ftp).*?)([%s]|$)''' % terminators)

# Parameter dicts for the UPDATE statement issued after this loop.  Only rows
# whose refUrl actually changes are queued, so no no-op UPDATEs are sent and
# the reported count reflects real changes.
updates = []
cur = wikinbitDb.conn.cursor()
cur.execute("select urlName, revisionId, lineNumber, refUrl from revisions where refUrl is not null")
for (urlName, revisionId, lineNumber, refUrl) in cur:
    m = _url_re.match(refUrl)
    if m is None:
        # match() is anchored at the start of the string, so this row does
        # not begin with "http" or "ftp".  Abort, as the script always did
        # for rows it could not process, but with an explicit message rather
        # than an AttributeError on None.
        sys.exit("This refUrl does not start with http or ftp:\t" + repr(refUrl))
    try:
        # Split and re-join the truncated URL through urlparse.
        newRefUrl = urlparse.urlsplit(m.group(1)).geturl()
        # Round-trip through UTF-8 so undecodable data raises here.
        # Presumably refUrl is a UTF-8 byte string (Python 2) -- TODO confirm.
        newRefUrl = newRefUrl.decode("utf8").encode("utf8")
    except Exception:
        # format_exc() takes an optional depth limit, not the exception; it
        # always formats the exception currently being handled.
        print(traceback.format_exc())
        sys.exit("This refUrl failed:\t" + repr(refUrl))

    if newRefUrl == refUrl:
        # Nothing to fix for this row.
        continue
    if len(newRefUrl) > len(refUrl):
        # Truncation can only shorten a URL; a longer one means something
        # unexpected happened while re-joining it.
        sys.exit("How could we ever get a longer URL?\n\t%s\n\t--> %s" % (refUrl, newRefUrl))

    print("Will change:\n\t%s\n\t--> %s" % (refUrl, newRefUrl))
    updates.append({
        'urlName': urlName,
        'revisionId': revisionId,
        'lineNumber': lineNumber,
        'refUrl': newRefUrl,
    })


# NOTE: This script has never been run.  I found a bug in the original when_linked file that put the revisionTime in the lineNumber field, thus rendering the entire table useless.  I am dropping the table and recreating it with the links properly extracted.

# Apply every queued correction with a single executemany() call, then commit
# and close the connection.  Each entry in `updates` is a dict supplying the
# four named parameters used in the statement below.
print("Attempting %d updates" % len(updates))
try:
    # NOTE(review): rows are identified by (urlName, revisionId, lineNumber)
    # only.  If several rows can share that triple, they would all receive
    # the same refUrl -- confirm the triple is unique in `revisions`.
    cur.executemany('''UPDATE revisions SET refUrl = %(refUrl)s WHERE urlName = %(urlName)s AND revisionId = %(revisionId)s and lineNumber = %(lineNumber)s''', updates)
except Exception:
    # Exit before the commit below, reporting the full traceback.
    # format_exc() formats the exception currently being handled; its
    # optional argument is a depth limit, not the exception object.
    sys.exit(traceback.format_exc())

wikinbitDb.conn.commit()

print("Done updating %d links" % len(updates))
wikinbitDb.conn.close()
